Introduction to Web Crawlers
- A web crawler is a program or script that automatically fetches information from the World Wide Web according to certain rules (definition from Baidu Baike).
Why Crawl
- Collect data for market research and business analysis.
- Provide raw data for machine learning and data mining.
- Gather high-quality resources.
How a Crawler Works
- Most crawlers follow the flow "send a request → receive the page → parse the page → extract and store the content", which essentially mimics how we fetch a web page with a browser; a minimal sketch of this flow is shown below.
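To make the flow concrete, here is a minimal sketch (not part of the crawler later in this post) that requests a page, parses it, extracts link text and URLs, and stores them in a CSV file. The URL `https://example.com/jobs` and the `div.job` selector are placeholder assumptions, not markup from any real site:

```python
import csv

import requests
from bs4 import BeautifulSoup

# 1. Send the request (placeholder URL, assumed for illustration only).
resp = requests.get("https://example.com/jobs", timeout=10)

# 2. Receive the page and parse it.
soup = BeautifulSoup(resp.content, "lxml")

# 3. Extract the fields of interest ("div.job" is an assumed selector).
rows = []
for item in soup.find_all("div", class_="job"):
    link = item.find("a")
    if link:
        rows.append([link.get_text(strip=True), link.get("href", "")])

# 4. Store the content.
with open("jobs.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(rows)
```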
Related Python Packages
- Python offers many packages for crawling, such as urllib, requests, bs4, scrapy, and pyspider; a quick comparison of fetching a page with urllib and with requests is sketched below.
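As a rough comparison, the sketch below fetches the same placeholder page once with the standard-library urllib and once with requests; the URL and the User-Agent header are assumptions made only for illustration:

```python
from urllib.request import Request, urlopen

import requests

UA = {"User-Agent": "Mozilla/5.0"}  # assumed header, just to look like a browser

# urllib: in the standard library, but a bit more verbose.
req = Request("https://example.com", headers=UA)
with urlopen(req, timeout=10) as resp:
    html_urllib = resp.read().decode("utf-8", errors="ignore")

# requests: third-party, more concise, and handles encoding for you.
html_requests = requests.get("https://example.com", headers=UA, timeout=10).text
```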
A Simple Crawler Example
- Below is a simple Python crawler that scrapes the demand for Unity3D talent in Chengdu from 51job (前程无忧):
```python
import csv
import os
from collections import Counter
from pprint import pprint

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


class JobSpider:
    def __init__(self):
        self.company = []
        self.text = ""
        self.headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
        }

    def job_spider(self):
        """Crawler entry point: scrape the search-result list pages."""
        # Pages to scrape; the page number is substituted into the {} placeholder.
        url = ("https://search.51job.com/list/090200,000000,0000,00,9,99,Unity3d,2,{}.html"
               "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
               "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
               "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
               "&specialarea=00&from=&welfare=")
        urls = [url.format(p) for p in range(1, 100)]
        for url in urls:
            # Request the list page.
            r = requests.get(url, headers=self.headers).content
            # Parse out every job row in the result table.
            bs = BeautifulSoup(r, 'lxml').find(
                "div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href, post = b.find('a')['href'], b.find('a')['title']
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    d = {
                        'href': href,
                        'post': post,
                        'locate': locate,
                        'salary': salary
                    }
                    self.company.append(d)
                except Exception:
                    # Header rows and ads lack these fields; skip them.
                    pass

    def post_require(self):
        """Scrape the full job description of every collected posting."""
        for c in self.company:
            r = requests.get(
                c.get('href'), headers=self.headers).content.decode('gbk')
            bs = BeautifulSoup(r, 'lxml').find(
                'div', class_="bmsg job_msg inbox").text
            # Strip the "report" / "share" widget text and tabs.
            s = bs.replace("举报", "").replace("分享", "").replace("\t", "").strip()
            self.text += s
        print(self.text)
        with open(os.path.join("data", "post_require.txt"), "w+") as f:
            f.write(self.text)

    @staticmethod
    def post_desc_counter():
        """Word-frequency statistics over the job descriptions."""
        post = open(os.path.join("data", "post_require.txt"), "r").read()
        # Tokenize with jieba, loading a user dictionary first.
        file_path = os.path.join("data", "user_dict.txt")
        jieba.load_userdict(file_path)
        seg_list = jieba.cut(post, cut_all=False)
        counter = dict()
        for seg in seg_list:
            counter[seg] = counter.get(seg, 0) + 1
        counter_sort = sorted(
            counter.items(), key=lambda value: value[1], reverse=True)
        pprint(counter_sort)
        with open(os.path.join("data", "post_pre_desc_counter.csv"),
                  "w+", newline="", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_sort)

    def post_counter(self):
        """Count how often each job title appears."""
        lst = [c.get('post') for c in self.company]
        counter = Counter(lst)
        counter_most = counter.most_common()
        pprint(counter_most)
        with open(os.path.join("data", "post_pre_counter.csv"),
                  "w+", newline="", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter_most)

    def post_salary_locate(self):
        """Overview of every posting: salary, job title, location and link."""
        lst = []
        for c in self.company:
            # Columns: salary, job title, location, link.
            lst.append((c.get('salary'), c.get('post'),
                        c.get('locate'), c.get('href')))
        pprint(lst)
        file_path = os.path.join("data", "post_salary_locate.csv")
        with open(file_path, "w+", newline="", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(lst)

    @staticmethod
    def post_salary():
        """Normalize salary ranges to a single unit (10k CNY per month)."""
        mouth = []      # 万/月 (10k per month)
        year = []       # 万/年 (10k per year)
        thousand = []   # 千/月 (1k per month)
        with open(os.path.join("data", "post_salary_locate.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                if "万/月" in row[0]:
                    mouth.append((row[0][:-3], row[2], row[1]))
                elif "万/年" in row[0]:
                    year.append((row[0][:-3], row[2], row[1]))
                elif "千/月" in row[0]:
                    thousand.append((row[0][:-3], row[2], row[1]))
        # Estimate each range as its lower bound plus 40% of the spread.
        calc = []
        for m in mouth:
            s = m[0].split("-")
            calc.append(
                (round((float(s[1]) - float(s[0])) * 0.4 + float(s[0]), 1),
                 m[1], m[2]))
        for y in year:
            s = y[0].split("-")
            calc.append(
                (round(((float(s[1]) - float(s[0])) * 0.4 + float(s[0])) / 12, 1),
                 y[1], y[2]))
        for t in thousand:
            s = t[0].split("-")
            calc.append(
                (round(((float(s[1]) - float(s[0])) * 0.4 + float(s[0])) / 10, 1),
                 t[1], t[2]))
        pprint(calc)
        with open(os.path.join("data", "post_salary.csv"),
                  "w+", newline="", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(calc)

    @staticmethod
    def post_salary_counter():
        """Salary statistics."""
        with open(os.path.join("data", "post_salary.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            lst = [row[0] for row in f_csv]
        counter = Counter(lst).most_common()
        pprint(counter)
        with open(os.path.join("data", "post_salary_counter1.csv"),
                  "w+", newline="", encoding="utf-8") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(counter)

    @staticmethod
    def world_cloud():
        """Generate a word cloud from the description word counts."""
        counter = {}
        with open(os.path.join("data", "post_pre_desc_counter.csv"),
                  "r", encoding="utf-8") as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                counter[row[0]] = counter.get(row[0], int(row[1]))
        pprint(counter)
        file_path = os.path.join("font", "msyh.ttf")
        wc = WordCloud(font_path=file_path,
                       max_words=100,
                       height=600,
                       width=1200)
        wc.generate_from_frequencies(counter)
        plt.imshow(wc)
        plt.axis('off')
        plt.show()
        wc.to_file(os.path.join("images", "wc.jpg"))


if __name__ == "__main__":
    # Expects data/, font/msyh.ttf and images/ to exist next to the script,
    # plus a jieba user dictionary at data/user_dict.txt.
    spider = JobSpider()
    spider.job_spider()
    spider.post_salary_locate()
    spider.post_salary()
    spider.post_salary_counter()
    spider.post_counter()
    # The word cloud needs the files written by these two steps.
    spider.post_require()
    spider.post_desc_counter()
    spider.world_cloud()
```